Image.jpeg

In [1]:
#Data Analysis
import numpy as np
import pandas as pd

#Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.express as px
%matplotlib inline
In [2]:
import warnings
warnings.simplefilter(action='ignore', category=Warning)

pd.set_option('display.max_columns', None)
matches = pd.read_csv(r"E:\Coding Nest\meetup\IPL2008-2019\datasets\matches.csv")
deliveries = pd.read_csv(r"E:\Coding Nest\meetup\IPL2008-2019\datasets\deliveries.csv")
In [3]:
print(matches.shape, deliveries.shape)
(756, 18) (179078, 21)
In [4]:
matches.head()
Out[4]:
id season city date team1 team2 toss_winner toss_decision result dl_applied winner win_by_runs win_by_wickets player_of_match venue umpire1 umpire2 umpire3
0 1 2017 Hyderabad 2017-04-05 Sunrisers Hyderabad Royal Challengers Bangalore Royal Challengers Bangalore field normal 0 Sunrisers Hyderabad 35 0 Yuvraj Singh Rajiv Gandhi International Stadium, Uppal AY Dandekar NJ Llong NaN
1 2 2017 Pune 2017-04-06 Mumbai Indians Rising Pune Supergiant Rising Pune Supergiant field normal 0 Rising Pune Supergiant 0 7 SPD Smith Maharashtra Cricket Association Stadium A Nand Kishore S Ravi NaN
2 3 2017 Rajkot 2017-04-07 Gujarat Lions Kolkata Knight Riders Kolkata Knight Riders field normal 0 Kolkata Knight Riders 0 10 CA Lynn Saurashtra Cricket Association Stadium Nitin Menon CK Nandan NaN
3 4 2017 Indore 2017-04-08 Rising Pune Supergiant Kings XI Punjab Kings XI Punjab field normal 0 Kings XI Punjab 0 6 GJ Maxwell Holkar Cricket Stadium AK Chaudhary C Shamshuddin NaN
4 5 2017 Bangalore 2017-04-08 Royal Challengers Bangalore Delhi Daredevils Royal Challengers Bangalore bat normal 0 Royal Challengers Bangalore 15 0 KM Jadhav M Chinnaswamy Stadium NaN NaN NaN
In [5]:
deliveries.head()
Out[5]:
match_id inning batting_team bowling_team over ball batsman non_striker bowler is_super_over wide_runs bye_runs legbye_runs noball_runs penalty_runs batsman_runs extra_runs total_runs player_dismissed dismissal_kind fielder
0 1 1 Sunrisers Hyderabad Royal Challengers Bangalore 1 1 DA Warner S Dhawan TS Mills 0 0 0 0 0 0 0 0 0 NaN NaN NaN
1 1 1 Sunrisers Hyderabad Royal Challengers Bangalore 1 2 DA Warner S Dhawan TS Mills 0 0 0 0 0 0 0 0 0 NaN NaN NaN
2 1 1 Sunrisers Hyderabad Royal Challengers Bangalore 1 3 DA Warner S Dhawan TS Mills 0 0 0 0 0 0 4 0 4 NaN NaN NaN
3 1 1 Sunrisers Hyderabad Royal Challengers Bangalore 1 4 DA Warner S Dhawan TS Mills 0 0 0 0 0 0 0 0 0 NaN NaN NaN
4 1 1 Sunrisers Hyderabad Royal Challengers Bangalore 1 5 DA Warner S Dhawan TS Mills 0 2 0 0 0 0 0 2 2 NaN NaN NaN
In [6]:
matches.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 756 entries, 0 to 755
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               756 non-null    int64 
 1   season           756 non-null    int64 
 2   city             749 non-null    object
 3   date             756 non-null    object
 4   team1            756 non-null    object
 5   team2            756 non-null    object
 6   toss_winner      756 non-null    object
 7   toss_decision    756 non-null    object
 8   result           756 non-null    object
 9   dl_applied       756 non-null    int64 
 10  winner           752 non-null    object
 11  win_by_runs      756 non-null    int64 
 12  win_by_wickets   756 non-null    int64 
 13  player_of_match  752 non-null    object
 14  venue            756 non-null    object
 15  umpire1          754 non-null    object
 16  umpire2          754 non-null    object
 17  umpire3          119 non-null    object
dtypes: int64(5), object(13)
memory usage: 106.4+ KB
In [7]:
deliveries.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179078 entries, 0 to 179077
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   match_id          179078 non-null  int64 
 1   inning            179078 non-null  int64 
 2   batting_team      179078 non-null  object
 3   bowling_team      179078 non-null  object
 4   over              179078 non-null  int64 
 5   ball              179078 non-null  int64 
 6   batsman           179078 non-null  object
 7   non_striker       179078 non-null  object
 8   bowler            179078 non-null  object
 9   is_super_over     179078 non-null  int64 
 10  wide_runs         179078 non-null  int64 
 11  bye_runs          179078 non-null  int64 
 12  legbye_runs       179078 non-null  int64 
 13  noball_runs       179078 non-null  int64 
 14  penalty_runs      179078 non-null  int64 
 15  batsman_runs      179078 non-null  int64 
 16  extra_runs        179078 non-null  int64 
 17  total_runs        179078 non-null  int64 
 18  player_dismissed  8834 non-null    object
 19  dismissal_kind    8834 non-null    object
 20  fielder           6448 non-null    object
dtypes: int64(13), object(8)
memory usage: 28.7+ MB

Data Cleaning

In [8]:
#city column
matches[matches["city"].isna()]
Out[8]:
id season city date team1 team2 toss_winner toss_decision result dl_applied winner win_by_runs win_by_wickets player_of_match venue umpire1 umpire2 umpire3
461 462 2014 NaN 2014-04-19 Mumbai Indians Royal Challengers Bangalore Royal Challengers Bangalore field normal 0 Royal Challengers Bangalore 0 7 PA Patel Dubai International Cricket Stadium Aleem Dar AK Chaudhary NaN
462 463 2014 NaN 2014-04-19 Kolkata Knight Riders Delhi Daredevils Kolkata Knight Riders bat normal 0 Delhi Daredevils 0 4 JP Duminy Dubai International Cricket Stadium Aleem Dar VA Kulkarni NaN
466 467 2014 NaN 2014-04-23 Chennai Super Kings Rajasthan Royals Rajasthan Royals field normal 0 Chennai Super Kings 7 0 RA Jadeja Dubai International Cricket Stadium HDPK Dharmasena RK Illingworth NaN
468 469 2014 NaN 2014-04-25 Sunrisers Hyderabad Delhi Daredevils Sunrisers Hyderabad bat normal 0 Sunrisers Hyderabad 4 0 AJ Finch Dubai International Cricket Stadium M Erasmus S Ravi NaN
469 470 2014 NaN 2014-04-25 Mumbai Indians Chennai Super Kings Mumbai Indians bat normal 0 Chennai Super Kings 0 7 MM Sharma Dubai International Cricket Stadium BF Bowden M Erasmus NaN
474 475 2014 NaN 2014-04-28 Royal Challengers Bangalore Kings XI Punjab Kings XI Punjab field normal 0 Kings XI Punjab 0 5 Sandeep Sharma Dubai International Cricket Stadium BF Bowden S Ravi NaN
476 477 2014 NaN 2014-04-30 Sunrisers Hyderabad Mumbai Indians Mumbai Indians field normal 0 Sunrisers Hyderabad 15 0 B Kumar Dubai International Cricket Stadium HDPK Dharmasena M Erasmus NaN
In [9]:
matches.loc[matches["city"].isna(), 'city'] = 'Dubai'
In [10]:
#winner column
matches[matches["winner"].isna()]
Out[10]:
id season city date team1 team2 toss_winner toss_decision result dl_applied winner win_by_runs win_by_wickets player_of_match venue umpire1 umpire2 umpire3
300 301 2011 Delhi 2011-05-21 Delhi Daredevils Pune Warriors Delhi Daredevils bat no result 0 NaN 0 0 NaN Feroz Shah Kotla SS Hazare RJ Tucker NaN
545 546 2015 Bangalore 2015-04-29 Royal Challengers Bangalore Rajasthan Royals Rajasthan Royals field no result 0 NaN 0 0 NaN M Chinnaswamy Stadium JD Cloete PG Pathak NaN
570 571 2015 Bangalore 2015-05-17 Delhi Daredevils Royal Challengers Bangalore Royal Challengers Bangalore field no result 0 NaN 0 0 NaN M Chinnaswamy Stadium HDPK Dharmasena K Srinivasan NaN
744 11340 2019 Bengaluru 30/04/19 Royal Challengers Bangalore Rajasthan Royals Rajasthan Royals field no result 0 NaN 0 0 NaN M. Chinnaswamy Stadium Nigel Llong Ulhas Gandhe Anil Chaudhary
In [11]:
# winner and player_of_match columns
matches.loc[matches["winner"].isna(), 'winner'] = 'No Winner'
matches.loc[matches["player_of_match"].isna(),"player_of_match"] = "No Player_of_match"
In [12]:
# umpire1 and umpire2 column
matches[matches["umpire1"].isna()]
Out[12]:
id season city date team1 team2 toss_winner toss_decision result dl_applied winner win_by_runs win_by_wickets player_of_match venue umpire1 umpire2 umpire3
4 5 2017 Bangalore 2017-04-08 Royal Challengers Bangalore Delhi Daredevils Royal Challengers Bangalore bat normal 0 Royal Challengers Bangalore 15 0 KM Jadhav M Chinnaswamy Stadium NaN NaN NaN
753 11413 2019 Visakhapatnam 08/05/19 Sunrisers Hyderabad Delhi Capitals Delhi Capitals field normal 0 Delhi Capitals 0 2 RR Pant ACA-VDCA Stadium NaN NaN NaN
In [13]:
matches.loc[(matches["umpire1"].isna()) | (matches["id"]==5), ['umpire1','umpire2']] = ["S Ravi","VK Sharma"]
matches.loc[(matches["umpire1"].isna()) | (matches["id"]==11413), ['umpire1','umpire2']] = ['Bruce Oxenford' , 'Sundaram Ravi']            
In [14]:
matches.drop("umpire3",axis=1,inplace=True)
In [15]:
matches.replace(to_replace='Rising Pune Supergiant', value='Rising Pune Supergiants', inplace=True)
deliveries.replace(to_replace='Rising Pune Supergiant', value='Rising Pune Supergiants', inplace=True)

matches.replace(to_replace='Delhi Capitals', value='Delhi Daredevils', inplace=True)
deliveries.replace(to_replace='Delhi Capitals', value='Delhi Daredevils', inplace=True)
In [16]:
matches.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 756 entries, 0 to 755
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               756 non-null    int64 
 1   season           756 non-null    int64 
 2   city             756 non-null    object
 3   date             756 non-null    object
 4   team1            756 non-null    object
 5   team2            756 non-null    object
 6   toss_winner      756 non-null    object
 7   toss_decision    756 non-null    object
 8   result           756 non-null    object
 9   dl_applied       756 non-null    int64 
 10  winner           756 non-null    object
 11  win_by_runs      756 non-null    int64 
 12  win_by_wickets   756 non-null    int64 
 13  player_of_match  756 non-null    object
 14  venue            756 non-null    object
 15  umpire1          756 non-null    object
 16  umpire2          756 non-null    object
dtypes: int64(5), object(12)
memory usage: 100.5+ KB

EDA

Total Number of matches in every season

In [17]:
total_matches_season = matches["season"].value_counts().reset_index()
total_matches_season.columns = ["Season","Matches"]
sns.barplot(x='Season',y="Matches", data=total_matches_season,palette='magma').set(title="Total number of matches in each season");

Matches Played vs Win by each Team

In [18]:
matches_played = pd.concat([matches["team1"],matches["team2"]], axis=0)
matches_played = matches_played.value_counts().reset_index()
matches_played.columns = ["Team","Total Matches Played"]
matches_played["winner"] = matches["winner"].value_counts().reset_index()["winner"]
matches_played["winning_percentage"] = matches_played["winner"]/matches_played["Total Matches Played"]
matches_played
Out[18]:
Team Total Matches Played winner winning_percentage
0 Mumbai Indians 187 109 0.582888
1 Royal Challengers Bangalore 180 100 0.555556
2 Kolkata Knight Riders 178 92 0.516854
3 Delhi Daredevils 177 84 0.474576
4 Kings XI Punjab 176 82 0.465909
5 Chennai Super Kings 164 77 0.469512
6 Rajasthan Royals 147 75 0.510204
7 Sunrisers Hyderabad 108 58 0.537037
8 Deccan Chargers 75 29 0.386667
9 Pune Warriors 46 15 0.326087
10 Gujarat Lions 30 13 0.433333
11 Rising Pune Supergiants 30 12 0.400000
12 Kochi Tuskers Kerala 14 6 0.428571
In [19]:
fig,axes = plt.subplots(nrows=1,ncols=3,figsize=(24, 6))
sns.barplot(x='Team',y='Total Matches Played',data=matches_played, ax=axes[0]).set_xticklabels(
    matches_played["Team"],rotation=90)
sns.barplot(x='Team',y='winner',data=matches_played, ax=axes[1]).set_xticklabels(
    matches_played["Team"],rotation=90)
sns.barplot(x='Team',y='winning_percentage',data=matches_played, ax=axes[2]).set_xticklabels(
    matches_played["Team"],rotation=90);

which city has hosted max. number of matches?

In [20]:
matches_city = matches['city'].value_counts().reset_index().sort_values(by='city', ascending=False)
matches_city.columns = ['City','No. of Matches']
sns.barplot(x='City',y='No. of Matches',data=matches_city).set_xticklabels(
    matches_city["City"],rotation=90);

Number of matches hosted in each stadium

In [21]:
fig, axes = plt.subplots(figsize=(15, 8))
matches_venue = matches["venue"].value_counts().reset_index().sort_values(by='venue', ascending=False)
matches_venue.columns = ["Venue",'No. of Matches']
sns.barplot(x='Venue',y='No. of Matches',data=matches_venue).set_xticklabels(
    matches_venue["Venue"],rotation=90);

Total and Average runs per season

In [22]:
runs = matches.merge(deliveries, left_on = 'id', right_on = 'match_id', how = 'left').drop('id', axis = 1)
total_runs_season = runs.groupby(['season'])['total_runs'].sum().reset_index()
matches_season = matches.groupby(['season']).count()["id"].reset_index()
matches_season.rename(columns={'id':'matches'},inplace=True)
matches_season["total_runs"] = total_runs_season["total_runs"]
matches_season["average_runs_per_match"] = matches_season["total_runs"]/ matches_season['matches']
matches_season
Out[22]:
season matches total_runs average_runs_per_match
0 2008 58 17937 309.258621
1 2009 57 16353 286.894737
2 2010 60 18883 314.716667
3 2011 73 21154 289.780822
4 2012 74 22453 303.418919
5 2013 76 22602 297.394737
6 2014 60 18931 315.516667
7 2015 59 18353 311.067797
8 2016 60 18862 314.366667
9 2017 59 18786 318.406780
10 2018 60 20706 345.100000
11 2019 60 20270 337.833333
In [23]:
fig,axes = plt.subplots(nrows=2,ncols=1,figsize=(15, 10))
sns.barplot(x='season',y='total_runs',data=matches_season, ax=axes[0],palette='magma').set_xticklabels(
    matches_season["season"],rotation=45)
sns.barplot(x='season',y='average_runs_per_match',data=matches_season, ax=axes[1],palette='magma').set_xticklabels(
    matches_season["season"],rotation=45);

Number of runs scored as percentage of Boundaries

In [24]:
deliveries['boundaries'] = 0

deliveries.loc[(deliveries['batsman_runs'] >= 4) & (deliveries['batsman_runs'] != 6),'boundaries'] = 4
deliveries.loc[(deliveries['batsman_runs'] >= 6),'boundaries'] = 6

percentage_runs_boundaries = pd.merge(
    deliveries[['batsman', 'batsman_runs']].groupby(by='batsman').sum().reset_index(),
    deliveries[['batsman', 'boundaries']].groupby(by='batsman').sum().reset_index(),
    how='outer')

percentage_runs_boundaries['run%'] = np.round(percentage_runs_boundaries['boundaries']*100/percentage_runs_boundaries['batsman_runs'],2)
percentage_runs_boundaries = percentage_runs_boundaries.sort_values(by='run%', ascending=False)

percentage_runs_boundaries.dropna(inplace=True)
percentage_runs_boundaries = percentage_runs_boundaries.sort_values(by="batsman_runs", ascending=False).head(20)
plt.figure(figsize=[20,10])
plt.bar(percentage_runs_boundaries['batsman'], percentage_runs_boundaries['batsman_runs'], label='Batsman Runs', bottom=0)
plt.bar(percentage_runs_boundaries['batsman'], percentage_runs_boundaries['boundaries'], label='Boundary Runs', bottom=0)
plt.title('Highest Number of Runs scored as Percentage of Boundaries')
plt.xlabel('Batsman')
plt.ylabel('Runs Scored')
plt.legend()
plt.xticks(rotation=45)
plt.show()

Fantasy League

IPL-fantasy-league-2019-1024x560.jpg

Batsman Stats

In [25]:
#Man_of_match
man_of_match = matches.groupby(matches["player_of_match"])["player_of_match"].count().sort_values(ascending=False).head(10)
man_of_match = man_of_match.to_frame().rename(columns={'player_of_match':'times'}).reset_index()

#century
century = deliveries.groupby(['batsman','match_id']).agg({'batsman_runs':'sum'})
century = century[century['batsman_runs']>=100]
century = century.groupby(['batsman']).agg({'count'})
century.columns = century.columns.droplevel()
century = century.sort_values(by='count',ascending=False).reset_index()

#half-century
half_century = deliveries.groupby(['batsman','match_id']).agg({'batsman_runs':'sum'})
half_century = half_century[half_century['batsman_runs']>=50]
half_century = half_century[half_century['batsman_runs']<100]
half_century = half_century.groupby(['batsman']).agg({'count'})
half_century.columns = half_century.columns.droplevel()
half_century = half_century.sort_values(by='count',ascending=False).reset_index()

#run-30's
run_30 = deliveries.groupby(['batsman','match_id']).agg({'batsman_runs':'sum'})
runs_30 = run_30[run_30['batsman_runs']>=30]
runs_30 = runs_30[runs_30['batsman_runs']<50]
runs_30 = runs_30.groupby(['batsman']).agg({'count'})
runs_30.columns = runs_30.columns.droplevel()
runs_30 = runs_30.sort_values(by='count',ascending=False).reset_index()

run_stat = pd.merge(century,half_century, on='batsman',how='right')
run_stat = pd.merge(run_stat, runs_30, on='batsman',how='right')
run_stat = run_stat.fillna(0)

#Strike Rate
strike_rate = deliveries.groupby(['batsman']).agg({'ball':'count','batsman_runs':'mean'}).sort_values(by='batsman_runs',ascending=False)
strike_rate.rename(columns ={'batsman_runs' : 'strike rate'}, inplace=True)

#Runs per match
runs_per_match = deliveries.groupby(['batsman','match_id']).agg({'batsman_runs':'sum'})

#Total Runs
total_runs = runs_per_match.groupby(['batsman']).agg({'sum' ,'mean','count'})
total_runs.rename(columns ={'sum' : 'batsman run','count' : 'match count','mean' :'average score'}, inplace=True)
total_runs.columns = total_runs.columns.droplevel()

#Sixes
sixes = deliveries[['batsman','batsman_runs']][deliveries.batsman_runs==6].groupby(['batsman']).agg({'batsman_runs':'count'})

#Fours
four = deliveries[['batsman','batsman_runs']][deliveries.batsman_runs==4].groupby(['batsman']).agg({'batsman_runs':'count'})

#Batsman stats
batsman_stat = pd.merge(pd.merge(pd.merge(strike_rate,total_runs, left_index=True, right_index=True),
                                    sixes, left_index=True, right_index=True),four, left_index=True, 
                        right_index=True)

batsman_stat.rename(columns = {
    'ball' : 'Ball', 'strike rate':'Strike Rate',
    'batsman run' : 'Batsman Run','match count' : 'Match Count',
    'average score' : 'Average score' ,
    'batsman_runs_x' :'Six','batsman_runs_y':'Four'
},inplace=True)

batsman_stat['Strike Rate'] = batsman_stat['Strike Rate']*100
batsman_stat = batsman_stat.sort_values(by='Batsman Run',ascending=False).reset_index()

batsman_stats = pd.merge(batsman_stat, run_stat, on='batsman', how='left').fillna(0)
batsman_stats.rename(columns = {'count_x' : '100s', 'count_y' : '50s', 'count':'30s'},inplace=True)
batsman_stats
Out[25]:
batsman Ball Strike Rate Average score Match Count Batsman Run Six Four 100s 50s 30s
0 V Kohli 4211 129.042983 32.153846 169 5434 191 482 5.0 38.0 34.0
1 SK Raina 4044 133.902077 28.650794 189 5415 195 495 1.0 38.0 37.0
2 RG Sharma 3816 128.773585 27.000000 182 4914 194 431 1.0 36.0 31.0
3 DA Warner 3398 139.523249 37.626984 126 4741 181 459 4.0 44.0 19.0
4 S Dhawan 3776 122.669492 29.316456 158 4632 96 526 0.0 37.0 26.0
... ... ... ... ... ... ... ... ... ... ... ...
317 BJ Haddin 11 163.636364 18.000000 1 18 1 2 0.0 0.0 0.0
318 JJ van der Wath 17 105.882353 6.000000 3 18 1 1 0.0 0.0 0.0
319 S Singh 18 94.444444 17.000000 1 17 1 1 0.0 0.0 0.0
320 SD Lad 13 115.384615 15.000000 1 15 1 1 0.0 0.0 0.0
321 SA Abbott 13 115.384615 7.500000 2 15 1 1 0.0 0.0 0.0

322 rows × 11 columns

In [26]:
batsman_stats["total_points"] = batsman_stats["100s"]*8 + batsman_stats["50s"]*4 + batsman_stats["30s"]*2 \
                                + batsman_stats["Six"]*1 + batsman_stats["Four"]*.5 \
                                + batsman_stats["Batsman Run"] *.25
batsman_stats.head()
Out[26]:
batsman Ball Strike Rate Average score Match Count Batsman Run Six Four 100s 50s 30s total_points
0 V Kohli 4211 129.042983 32.153846 169 5434 191 482 5.0 38.0 34.0 2050.50
1 SK Raina 4044 133.902077 28.650794 189 5415 195 495 1.0 38.0 37.0 2030.25
2 RG Sharma 3816 128.773585 27.000000 182 4914 194 431 1.0 36.0 31.0 1852.00
3 DA Warner 3398 139.523249 37.626984 126 4741 181 459 4.0 44.0 19.0 1841.75
4 S Dhawan 3776 122.669492 29.316456 158 4632 96 526 0.0 37.0 26.0 1717.00
In [27]:
best_batsman = batsman_stats[["batsman","total_points"]]

Fielder Stats

In [28]:
condition_catch = (deliveries["dismissal_kind"] == 'caught')
condition_run= (deliveries["dismissal_kind"] == 'run out')
condition_stump= (deliveries["dismissal_kind"] == 'stumped')
condition_caught_bowled = (deliveries["dismissal_kind"] == 'caught and bowled')

s_catch = deliveries.loc[condition_catch,:].groupby(deliveries.fielder).dismissal_kind.count().sort_values(ascending=False)
s_run = deliveries.loc[condition_run,:].groupby(deliveries.fielder).dismissal_kind.count().sort_values(ascending=False)
s_stump = deliveries.loc[condition_stump,:].groupby(deliveries.fielder).dismissal_kind.count().sort_values(ascending=False)
s_caught_bowled = deliveries.loc[condition_caught_bowled,:].groupby(deliveries.bowler).dismissal_kind.count().sort_values(ascending=False)

catch= s_catch.to_frame().reset_index().rename(columns ={'dismissal_kind' : 'catch'})
run= s_run.to_frame().reset_index().rename(columns ={'dismissal_kind' : 'run_out'})
stump= s_stump.to_frame().reset_index().rename(columns ={'dismissal_kind' : 'stump'})
caught_bowled = s_caught_bowled.to_frame().reset_index().rename(columns ={'dismissal_kind' : 'caught and bowled'})  
field = pd.merge(pd.merge(catch,run,on='fielder', how='outer'),stump,on='fielder',how='outer')
field_stats = field[~field['fielder'].str.contains("(sub)")].reset_index().drop(['index'],axis=1).fillna(0)
field_stats
Out[28]:
fielder catch run_out stump
0 KD Karthik 109.0 14.0 29.0
1 SK Raina 99.0 16.0 0.0
2 MS Dhoni 98.0 23.0 38.0
3 AB de Villiers 93.0 14.0 7.0
4 RV Uthappa 84.0 9.0 32.0
... ... ... ... ...
444 WPUJC Vaas 0.0 1.0 0.0
445 Umar Gul 0.0 1.0 0.0
446 LH Ferguson 0.0 1.0 0.0
447 Vishnu Vinod 0.0 0.0 2.0
448 KH Devdhar 0.0 0.0 1.0

449 rows × 4 columns

In [29]:
field_stats["total_points"] = field_stats["catch"]*3 + field_stats["run_out"]*2 + field_stats["stump"]*1
field_stats.head()
Out[29]:
fielder catch run_out stump total_points
0 KD Karthik 109.0 14.0 29.0 384.0
1 SK Raina 99.0 16.0 0.0 329.0
2 MS Dhoni 98.0 23.0 38.0 378.0
3 AB de Villiers 93.0 14.0 7.0 314.0
4 RV Uthappa 84.0 9.0 32.0 302.0
In [30]:
best_fielder = field_stats[["fielder","total_points"]]

Bowler Stats

In [31]:
condition = ((deliveries["dismissal_kind"].notnull()) & 
            (deliveries["dismissal_kind"] != 'run out') &
            (deliveries["dismissal_kind"] != 'retired hurt' ) & 
            (deliveries["dismissal_kind"] != 'hit wicket') & 
            (deliveries["dismissal_kind"] != 'obstructing the field') &
            (deliveries["dismissal_kind"] != 'caught and bowled'))
        
df_bowlers = deliveries.loc[condition,:].groupby(deliveries["bowler"])["dismissal_kind"].count().sort_values(ascending=False).reset_index()
bowlers = pd.merge(df_bowlers, caught_bowled,on='bowler',how='left').fillna(0)

high = deliveries.groupby(['match_id', 'bowler']).agg({'total_runs':'sum'}).reset_index()
over_count = deliveries.groupby(['match_id', 'bowler','over']).agg({'total_runs':'sum'}).reset_index()
overs = over_count.groupby(['match_id','bowler']).agg({'over':'count'}).reset_index()
overs = overs[overs['over']>=2]

bowlers = pd.merge( high, overs,on=['match_id', 'bowler'], how='right')

bowlers['economy'] = bowlers['total_runs']/bowlers['over']
bowlers['eco_range'] = pd.cut(bowlers['economy'], [0, 4, 5, 6, 9, 10, 11, 30], 
                              labels=['below4', '4-5', '5-6', '6-9','9-10','10-11','above11'])

bowlers = pd.concat([bowlers, pd.get_dummies(bowlers['eco_range'], prefix='eco')], axis=1)

economy_rates = bowlers.groupby(['bowler']).agg(
    {'eco_below4':'sum',
     'eco_4-5':'sum',
     'eco_5-6':'sum',
     'eco_6-9':'sum',
     'eco_9-10':'sum',
     'eco_10-11':'sum',
     'eco_above11':'sum'}).reset_index()

maiden_over = over_count[over_count['total_runs']==0]
maidens = maiden_over['bowler'].value_counts().to_frame().reset_index().rename({
    'index':'bowler',
    'bowler':'maiden_overs'},axis=1)

hauls=deliveries.groupby(['match_id', 'bowler']).agg({'player_dismissed':'count'}).reset_index()
hauls = hauls[hauls['player_dismissed']>=3]
hauls['haul'] = pd.cut(hauls['player_dismissed'], [0,3,8], labels=['3', '5'])
hauls = pd.concat([hauls,pd.get_dummies(hauls['haul'], prefix='haul')],axis=1)
hauls.drop(['player_dismissed','haul'],inplace=True,axis=1)
hauls=hauls.groupby(['bowler']).agg({'haul_3':'sum','haul_5':'sum'}).reset_index()

bowlers_stats = pd.merge(pd.merge(pd.merge(economy_rates,maidens,on='bowler', how='left'),df_bowlers,on='bowler',how='left'),hauls,on='bowler',how='right').fillna(0)
bowlers_stats.rename(columns ={'dismissal_kind' : 'wickets'},inplace=True)
bowlers_stats
Out[31]:
bowler eco_below4 eco_4-5 eco_5-6 eco_6-9 eco_9-10 eco_10-11 eco_above11 maiden_overs wickets haul_3 haul_5
0 A Ashish Reddy 1 0 0 8 2 2 2 0.0 17.0 1 0
1 A Chandila 2 2 4 1 1 0 2 0.0 6.0 0 1
2 A Joseph 0 1 0 0 0 0 2 1.0 6.0 0 1
3 A Kumble 5 9 5 16 5 0 2 3.0 44.0 1 4
4 A Mishra 15 21 14 51 15 15 15 4.0 153.0 14 5
... ... ... ... ... ... ... ... ... ... ... ... ...
196 YA Abdulla 0 1 1 4 2 0 2 1.0 15.0 1 2
197 YK Pathan 3 5 13 25 4 5 4 1.0 40.0 3 0
198 YS Chahal 4 6 7 37 10 6 11 3.0 99.0 6 2
199 Yuvraj Singh 4 5 7 24 1 3 0 0.0 34.0 3 2
200 Z Khan 3 9 8 59 9 2 8 5.0 101.0 9 3

201 rows × 12 columns

In [32]:
bowlers_stats["total_points"] = bowlers_stats["haul_5"]*8 + bowlers_stats["haul_3"]*4 \
                                + bowlers_stats["wickets"]*2 + bowlers_stats["maiden_overs"]*1\
                                + bowlers_stats["eco_below4"]*.5
bowlers_stats.head()
Out[32]:
bowler eco_below4 eco_4-5 eco_5-6 eco_6-9 eco_9-10 eco_10-11 eco_above11 maiden_overs wickets haul_3 haul_5 total_points
0 A Ashish Reddy 1 0 0 8 2 2 2 0.0 17.0 1 0 38.5
1 A Chandila 2 2 4 1 1 0 2 0.0 6.0 0 1 21.0
2 A Joseph 0 1 0 0 0 0 2 1.0 6.0 0 1 21.0
3 A Kumble 5 9 5 16 5 0 2 3.0 44.0 1 4 129.5
4 A Mishra 15 21 14 51 15 15 15 4.0 153.0 14 5 413.5
In [33]:
best_bowler = bowlers_stats[["bowler","total_points"]]
In [34]:
best_batsman = best_batsman.rename(columns={"batsman": "player"})
best_bowler = best_bowler.rename(columns={"bowler": "player"})
best_fielder = best_fielder.rename(columns={"fielder": "player"})

Best Players for Fantasy Team

In [35]:
best_player = pd.merge(pd.merge(best_batsman,best_bowler,on='player',how='outer'),best_fielder,on='player',how='outer')

best_player = best_player.fillna(0)
best_player
best_player['points'] = best_player['total_points']+best_player['total_points_x']\
                        + best_player['total_points_y']
best_player.sort_values(by='points',ascending=False,inplace=True)
best_player=best_player.reset_index().drop(['index'],axis=1)

best_player = best_player.head(20)
best_player
Out[35]:
player total_points_x total_points_y total_points points
0 SK Raina 2030.25 0.0 329.0 2359.25
1 V Kohli 2050.50 0.0 253.0 2303.50
2 RG Sharma 1852.00 45.0 266.0 2163.00
3 MS Dhoni 1642.75 0.0 378.0 2020.75
4 DA Warner 1841.75 0.0 178.0 2019.75
5 AB de Villiers 1697.50 0.0 314.0 2011.50
6 CH Gayle 1865.00 41.0 78.0 1984.00
7 RV Uthappa 1663.50 0.0 302.0 1965.50
8 S Dhawan 1717.00 0.0 214.0 1931.00
9 SR Watson 1408.50 240.5 124.0 1773.00
10 KD Karthik 1346.00 0.0 384.0 1730.00
11 G Gambhir 1548.75 0.0 98.0 1646.75
12 AM Rahane 1415.00 0.0 178.0 1593.00
13 YK Pathan 1215.25 94.5 143.0 1452.75
14 KA Pollard 1050.00 121.5 246.0 1417.50
15 AT Rayudu 1228.50 0.0 181.0 1409.50
16 PA Patel 1054.50 0.0 247.0 1301.50
17 BB McCullum 1124.75 0.0 131.0 1255.75
18 Yuvraj Singh 1047.25 98.0 103.0 1248.25
19 MK Pandey 1030.50 0.0 217.0 1247.50
In [36]:
trace1 = go.Bar(
    x=best_player['player'],
    y=best_player['total_points_x'],
    name='Batting points',opacity=0.8,
    marker=dict(color='lightblue'))

trace2 = go.Bar(
    x=best_player['player'],
    y=best_player['total_points_y'],
    name='Bowling points',opacity=0.7,
    marker=dict(color='gold'))

trace3 = go.Bar(
    x=best_player['player'],
    y=best_player['total_points'],
    name='Fielding points',opacity=0.7,
    marker=dict(color='lightgreen'))


data = [trace1, trace2, trace3]
layout = go.Layout(title="Points Distribution of Top Players",barmode='stack',xaxis = dict(tickmode='linear'),
                                    yaxis = dict(title= "Points Distribution"))

fig = go.Figure(data=data, layout=layout)
fig.show()

Feature Engineering

In [37]:
matches_model = matches.copy()
matches_model.head()
Out[37]:
id season city date team1 team2 toss_winner toss_decision result dl_applied winner win_by_runs win_by_wickets player_of_match venue umpire1 umpire2
0 1 2017 Hyderabad 2017-04-05 Sunrisers Hyderabad Royal Challengers Bangalore Royal Challengers Bangalore field normal 0 Sunrisers Hyderabad 35 0 Yuvraj Singh Rajiv Gandhi International Stadium, Uppal AY Dandekar NJ Llong
1 2 2017 Pune 2017-04-06 Mumbai Indians Rising Pune Supergiants Rising Pune Supergiants field normal 0 Rising Pune Supergiants 0 7 SPD Smith Maharashtra Cricket Association Stadium A Nand Kishore S Ravi
2 3 2017 Rajkot 2017-04-07 Gujarat Lions Kolkata Knight Riders Kolkata Knight Riders field normal 0 Kolkata Knight Riders 0 10 CA Lynn Saurashtra Cricket Association Stadium Nitin Menon CK Nandan
3 4 2017 Indore 2017-04-08 Rising Pune Supergiants Kings XI Punjab Kings XI Punjab field normal 0 Kings XI Punjab 0 6 GJ Maxwell Holkar Cricket Stadium AK Chaudhary C Shamshuddin
4 5 2017 Bangalore 2017-04-08 Royal Challengers Bangalore Delhi Daredevils Royal Challengers Bangalore bat normal 0 Royal Challengers Bangalore 15 0 KM Jadhav M Chinnaswamy Stadium S Ravi VK Sharma
In [38]:
matches_model['date'] = pd.to_datetime(matches_model['date'])
matches_model["WeekDay"] = matches_model["date"].dt.weekday
In [39]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
for col in ['city','team1','team2','toss_winner','winner','toss_decision','result','venue']:
    encoder.fit(matches_model[col])
    matches_model[col] = encoder.transform(matches_model[col])
In [40]:
matches_model.head()
Out[40]:
id season city date team1 team2 toss_winner toss_decision result dl_applied winner win_by_runs win_by_wickets player_of_match venue umpire1 umpire2 WeekDay
0 1 2017 15 2017-04-05 12 11 11 1 1 0 13 35 0 Yuvraj Singh 28 AY Dandekar NJ Llong 2
1 2 2017 27 2017-04-06 7 10 10 1 1 0 11 0 7 SPD Smith 21 A Nand Kishore S Ravi 3
2 3 2017 29 2017-04-07 3 6 6 1 1 0 6 0 10 CA Lynn 31 Nitin Menon CK Nandan 4
3 4 2017 16 2017-04-08 10 4 4 1 1 0 4 0 6 GJ Maxwell 13 AK Chaudhary C Shamshuddin 5
4 5 2017 2 2017-04-08 11 2 11 0 1 0 12 15 0 KM Jadhav 17 S Ravi VK Sharma 5

Modeling

In [41]:
# Some of the matched are affected by rain where Duckworth-Lewis method applies. 
# Leaving those matches, let's try to predict.

matches_model = matches_model[matches_model["dl_applied"]==0]
matches_model.shape
Out[41]:
(737, 18)
In [42]:
matches_model.columns
Out[42]:
Index(['id', 'season', 'city', 'date', 'team1', 'team2', 'toss_winner',
       'toss_decision', 'result', 'dl_applied', 'winner', 'win_by_runs',
       'win_by_wickets', 'player_of_match', 'venue', 'umpire1', 'umpire2',
       'WeekDay'],
      dtype='object')
In [43]:
X = matches_model[['season', 'city', 'team1', 'team2', 'toss_winner',
       'toss_decision', 'result', 'win_by_runs','win_by_wickets', 'venue','WeekDay']]
y = matches_model['winner']
In [44]:
print(X.shape, y.shape)
(737, 11) (737,)
In [45]:
from sklearn.model_selection import train_test_split

# Split into training and test sets
X_train, X_test , y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train, y_train)
y_train_preds = rf.predict(X_train)

print("Training accuracy score ", accuracy_score(y_train, y_train_preds))
print("Training Data Confusion Matrix")
print(confusion_matrix(y_train, y_train_preds))

print("Testing Data Confusion Matrix")
print("Testing accuarcy score ", accuracy_score(y_test, rf.predict(X_test)))
print(confusion_matrix(y_test, rf.predict(X_test)))
Training accuracy score  1.0
Training Data Confusion Matrix
[[78  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 24  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 61  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 10  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 62  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  3  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 69  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 92  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  4  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 11  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 55  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 11  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0 62  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0 47]]
Testing Data Confusion Matrix
Testing accuarcy score  0.8513513513513513
[[20  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  4  1  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 13  0  1  0  0  0  0  0  0  0  0]
 [ 0  0  1  2  0  0  0  0  0  0  0  0  0]
 [ 0  0  1  0 13  0  2  1  0  1  0  0  0]
 [ 0  0  0  0  1  0  1  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 17  1  0  1  0  0  0]
 [ 0  0  0  0  1  0  0 14  0  1  0  1  0]
 [ 0  0  0  0  0  0  0  1  0  0  0  0  0]
 [ 0  0  0  0  0  0  1  0  0 17  1  0  0]
 [ 0  0  0  0  0  0  0  1  0  0  1  0  0]
 [ 1  0  0  0  0  0  0  0  0  0  0 18  0]
 [ 0  0  0  0  0  0  0  0  0  1  0  1  7]]
In [ ]: